# -*- coding: utf-8 -*-
# Python 3.8.3
"""
丹弗美术馆中国藏品爬虫
https://www.denverartmuseum.org/en/search/collections?search_api_fulltext=China#search-results
"""
import requests
from lxml import etree
import json

doc_path = 'denver.txt'    # one object-page URL per line
img_path = 'img_denver.txt'  # one image URL per line, positionally matched to doc_path

# Load the image URLs once, up front.  A context manager closes the file
# deterministically (the original bare open() leaked the handle), and the
# comprehension replaces the manual append loop.  `line` is already a str,
# so the str() wrapper was redundant.
with open(img_path, encoding='utf-8') as img_file:
    img_url_list = [line.strip() for line in img_file]

count = 0

# Labels that mark the first row of the lower-left detail list; the first
# matching row fixes where the value <div>s start.
_DETAIL_HEADERS = ('Dimensions', 'Inscription', 'Department', 'Collection', 'Edition')

# For each object-page URL: fetch the page, scrape its metadata into a dict,
# and append one `"<n>": {...},` JSON fragment to denver.json.
with open(doc_path, encoding='utf-8') as doc_file:
    for line in doc_file:
        url = line.strip()
        if not url:
            continue  # tolerate blank lines in the URL file
        html = etree.HTML(requests.get(url).text)

        print(f"{count}{url}")
        d = {"url": url}

        # Title / date / keyword tags from the upper-right header block.
        d["title"] = str(html.xpath('//*[@id="block-damweb-content"]/article/div[1]/div[2]/h1/text()')[0])
        d["date"] = str(html.xpath('//*[@id="block-damweb-content"]/article/div[1]/div[2]/div[1]/text()')[0])
        d["keywords"] = str(html.xpath('//*[@id="block-damweb-content"]/article/div[1]/div[2]/div[last()]/div/text()'))

        # Upper-right labelled fields.  The value <div> index tracks the label
        # position, starting at 2, advancing on every label whether or not it
        # matched (same as the original manual counter).
        labels_top = html.xpath('//*[@id="block-damweb-content"]/article/div[1]/div[2]/div[position()<last()]/div[1]/text()')
        for index, label in enumerate(labels_top, start=2):
            if label == 'Artist':
                detail = html.xpath('//*[@class="maker-list"]/dt/text()')
                detail.extend(html.xpath('//*[@class="maker-list"]/dd/text()'))
                d["Artist"] = detail
            elif label == 'Locale':
                d["Locale"] = html.xpath('//*[@id="block-damweb-content"]/article/div[1]/div[2]/div[{}]/div[2]/text()'.format(index))
            elif label == 'Country':
                d["Country"] = html.xpath('//*[@id="block-damweb-content"]/article/div[1]/div[2]/div[{}]/text()'.format(index))

        # Lower-left labelled detail fields.
        titles_2 = html.xpath('//*[@id="block-damweb-content"]/article/div[2]/div[1]/div/div/text()')
        size = len(html.xpath('//*[@id="block-damweb-content"]/article/div[2]/div[1]/div'))

        # BUG FIX: `pos` was only assigned when a header matched, so on a page
        # with no match it was either undefined (NameError on the first page)
        # or stale from the previous page (silently misaligned data).
        # Initialise it per page and skip the detail extraction when absent.
        pos = None
        for i in range(1, size):
            row_labels = html.xpath('//*[@id="block-damweb-content"]/article/div[2]/div[1]/div[{}]/div/text()'.format(i))
            if row_labels and row_labels[0] in _DETAIL_HEADERS:
                pos = i
                break

        detail_2 = []
        if pos is not None:
            for i in range(pos, size + 1):
                detail_2.append(html.xpath('//*[@id="block-damweb-content"]/article/div[2]/div[1]/div[{}]/text()'.format(i)))

        # Pair each purely-alphabetic label with its value list; the bounds
        # guard prevents an IndexError when labels outnumber value lists.
        ll = 0
        for title in titles_2:
            if title.isalpha() and ll < len(detail_2):
                d[title] = detail_2[ll]
                ll += 1

        # Free-text paragraphs from the lower-left and lower-right panels.
        paragraph_1 = html.xpath('//*[@id="block-damweb-content"]/article/div[2]/div[1]/div[1]/p/text()')
        d["paragraph_1"] = str(paragraph_1) if paragraph_1 else None
        paragraph_2 = html.xpath('//*[@id="block-damweb-content"]/article/div[2]/div[2]/div[1]/p/text()')
        d["paragraph_2"] = str(paragraph_2) if paragraph_2 else None

        # Guard against the two input files having different lengths
        # (previously an uncaught IndexError aborted the whole run).
        d["img_url"] = img_url_list[count] if count < len(img_url_list) else None
        count += 1
        # NOTE: the output is a stream of `"n": {...},` fragments; it only
        # becomes valid JSON once wrapped in braces and the trailing comma
        # removed (preserved from the original design).
        with open('denver.json', 'a', encoding='utf-8') as f:
            f.write("\"" + str(count) + "\":" + json.dumps(d) + ",")

